library("dplyr")
library ("ggplot2")
library ("stringr")
library ("plotly")
library("leaflet")
library("rjson")
## Warning: package 'rjson' was built under R version 4.1.2
# Load the incarceration trends CSV published in the vera-institute
# GitHub repository; every later section reads from this data frame.
incarceration <- read.csv(
  file = "https://raw.githubusercontent.com/vera-institute/incarceration-trends/master/incarceration_trends.csv"
)

Introduction + Summary Information

Questions: 1. What is the average AAPI jail population across all counties and all years? 2. When was the AAPI jail population the highest? 3. When was the AAPI jail population the lowest? 4. How much has the AAPI jail population changed over the last 10 years? 5. What is the standard deviation of the AAPI jail population?

The variable I chose is the number of Asian American / Pacific Islander (AAPI) people in jail. The unit is 0.01, so each value is 100 times smaller than the true number of people in jail. I computed the mean of this variable because it tells us the average number of people in jail across all of these years. The mean is about 1.98, which corresponds to 198 people in the real world; this is not a huge number, and it means that close to 200 AAPI people may be in jail every year. I also looked for the years with the maximum and minimum population. For the maximum value there is only one year, but the minimum value, which is 0, occurs in many years and those years repeat, so I had to keep only the unique years. I also computed the change in population over the most recent 10 years. Since much of the data comes from a much earlier time, I do not think it can represent the thoughts and actions of people in recent times, so I wanted to see what changed during these 10 years and how. Finally, the standard deviation shows how much the data varies, that is, whether the data is stable or not. The SD is 14.441, which I think means the data is not stable.

# ---- Summary statistics for the AAPI jail population ----
# Every aggregate below skips missing values (na.rm = TRUE).

# Mean over every row of the dataset.
mean_aapi_jail_pop <- incarceration %>%
  pull(aapi_jail_pop) %>%
  mean(na.rm = TRUE)

# Largest recorded value, and the year(s) of the row(s) holding it.
# which() drops NA comparisons, just like filter() does.
max_aapi_jail_pop <- incarceration %>%
  pull(aapi_jail_pop) %>%
  max(na.rm = TRUE)
time_max_aapi_jail_pop <-
  incarceration$year[which(incarceration$aapi_jail_pop == max_aapi_jail_pop)]

# Smallest recorded value. Many rows share it, so the raw year vector has
# repeats; the de-duplicated version is kept separately.
min_aapi_jail_pop <- incarceration %>%
  pull(aapi_jail_pop) %>%
  min(na.rm = TRUE)
time_min_aapi_jail_pop_base <-
  incarceration$year[which(incarceration$aapi_jail_pop == min_aapi_jail_pop)]
time_min_aapi_jail_pop <- unique(time_min_aapi_jail_pop_base)

# Rows from 2008 through 2018 (inclusive).
aapi_pop_10 <- incarceration %>%
  filter(year >= 2008, year <= 2018)
max_aapi_pop_10 <- aapi_pop_10 %>%
  pull(aapi_jail_pop) %>%
  max(na.rm = TRUE)
min_aapi_pop_10 <- aapi_pop_10 %>%
  pull(aapi_jail_pop) %>%
  min(na.rm = TRUE)

# NOTE(review): this is the spread (largest minus smallest row value) inside
# the 2008-2018 window, not a difference between the two end years.
change_aapi_jail_pop <- max_aapi_pop_10 - min_aapi_pop_10

# Standard deviation over every row of the dataset.
SD_aapi_jail_pop <- incarceration %>%
  pull(aapi_jail_pop) %>%
  sd(na.rm = TRUE)

Variable comparison chart

For this plot, I chose the relationship between the Black jail population and the white jail population, and I show the year by color: the lighter the color, the more recent the year. I chose these two variables because I already have the relationship between time and the white jail population from the previous question, and I think comparing the Black population against the white population's trend makes it easier to read. I used a scatter plot because it shows how the relationship is distributed. Most of the points are in the lower-left corner, which means both populations are low. Connecting this with the previous graph, we can see that for the points where the white population is high, the year is always 2017 or 2004, or close to them. This is important because it tells us that the graph of the Black population over time may have peaks at or around 2001 and 2014. We can learn this relationship without displaying that graph, which is convenient.

# Scatter plot of Black vs. white jail population, with points colored by year.
# NOTE(review): `incarceration_NA` is not defined in this part of the file; the
# labels say "west region", so it is presumably a West-region subset of
# `incarceration` built earlier -- confirm.
scatter <- ggplot(incarceration_NA) +
  geom_point(
    mapping = aes(x = black_jail_pop, y = white_jail_pop, color = year)
  ) +
  labs(
    title = "white people population and black people population in jail in west region relation",
    x = "black people population in jail in west region",
    # No trailing comma after the last label: an empty final argument is only
    # tolerated when labs() collects its dots through rlang.
    y = "white people population in jail in west region"
  )

# Render the static ggplot as an interactive plotly widget.
ggplotly(scatter)

Map

For this plot, I show a map of the US containing the Black jail population in 2018. I want to see how the population was distributed across different locations in 2018. I used the white and Black populations in the previous question and the white population in the first graph, so I decided to use the Black jail population this time. I made the map interactive, which means people can see the FIPS code and the population at the same time. This helps us look at the location and the population together; there may be a relationship between location and population. I also use the darkness of the color to show the size of the population: the lighter the color, the larger the population. From the map, we can see that there are not many places with a large population; there are only a few of them.

# ---- County-level choropleth of the 2018 Black jail population ----

# County boundaries as GeoJSON. Each feature id in this file is a
# 5-character, zero-padded county FIPS string (e.g. "01001").
url <- "https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json"
counties <- rjson::fromJSON(file = url)

# Keep only 2018 rows that have something to draw.
# * filter() has no `na.rm` option: a named TRUE is just an always-true
#   condition, so missing values have to be removed explicitly.
# * read.csv() parses `fips` as a number, which drops the leading zero of
#   codes below 10000; `fips_code` rebuilds the zero-padded string so those
#   counties can match the GeoJSON feature ids. The original `fips` column is
#   left untouched.
incarceration_2018 <- incarceration %>%
  filter(year == 2018, !is.na(fips), !is.na(black_jail_pop)) %>%
  mutate(fips_code = sprintf("%05d", as.integer(fips)))

# Geo layout: restrict the map to the USA and draw lakes in white.
g <- list(
  scope = "usa",
  projection = list(type = "albers usa"),
  showlakes = TRUE,
  lakecolor = toRGB("white")
)

# One choropleth trace; county outlines are hidden (line width 0) so that the
# fill colors are not obscured by borders.
fig <- plot_ly() %>%
  add_trace(
    type = "choropleth",
    geojson = counties,
    locations = incarceration_2018$fips_code,
    z = incarceration_2018$black_jail_pop,
    colorscale = "Viridis",
    marker = list(line = list(width = 0))
  ) %>%
  colorbar(title = "population black people in jail in 2018") %>%
  layout(
    title = "2018 distribution of black population in jail",
    geo = g
  )

fig